load packages

library(tidyverse)
library(glmnet)
library(caTools)
library(caret)
library(ROCR)

load coded data

coded = read.csv("~/Documents/code/dsnlab/automotion-test-set/FP_manuallyCoded_edited2.csv") %>%
  select(subjectID, run, volume, trash) %>%
  rename("artifact" = trash)

load stripe detection data

fileDir = "~/Documents/code/dsnlab/automotion-test-set/output/FP/"
filePattern = "FP_stripes_.*.csv"
  
file_list = list.files(fileDir, pattern = filePattern)

for (file in file_list){
  # if the merged dataset doesn't exist, create it
  if (!exists("stripes")){
    temp = read.csv(paste0(fileDir,file))
    stripes = data.frame(temp) %>% 
      rename("volume" = t,
             "subjectID" = subject) %>%
      select(-file)
    rm(temp)
  }
  
  # if the merged dataset does exist, append to it
  else {
    temp_dataset = read.csv(paste0(fileDir,file))
    temp_dataset = data.frame(temp_dataset) %>% 
      rename("volume" = t,
             "subjectID" = subject) %>%
      select(-file)
    stripes = rbind(stripes, temp_dataset)
    rm(temp_dataset)
  }
}

load global intensities and rps

# define paths and variables
rpDir = '~/Documents/code/dsnlab/automotion-test-set/FP_rp_txt/'
# outputDir = '~/Documents/code/tds_auto-motion/auto-motion-output/'
# plotDir = '~/Documents/code/tds_auto-motion/auto-motion-output/plots/'
study = "FP"
rpPattern = '^rp_(FP[0-9]{3})_(.*).txt'
rpCols = c("euclidian_trans","euclidian_rot","euclidian_trans_deriv","euclidian_rot_deriv","trash.rp")

# global intensities
intensities = read.csv(paste0('~/Documents/code/dsnlab/automotion-test-set/',study,'_globalIntensities.csv'))

# rp files
file_list = list.files(rpDir, pattern = rpPattern)

for (file in file_list){
  # if the merged dataset doesn't exist, create it
  if (!exists("rp")){
    temp = read.table(paste0(rpDir,file))
    colnames(temp) = rpCols
    rp = data.frame(temp, file = rep(file,count(temp))) %>% 
      mutate(volume = row_number()) %>%
      extract(file,c("subjectID","run"), rpPattern)
    rm(temp)
  }
  
  # if the merged dataset does exist, append to it
  else {
    temp_dataset = read.table(paste0(rpDir,file))
    colnames(temp_dataset) = rpCols
    temp_dataset = data.frame(temp_dataset, file = rep(file,count(temp_dataset))) %>% 
      mutate(volume = row_number()) %>%
      extract(file,c("subjectID","run"), rpPattern)
    rp = rbind(rp, temp_dataset)
    rm(temp_dataset)
  }
}

join dataframes

joined = left_join(stripes, coded, by = c("subjectID", "run", "volume")) %>%
  left_join(., intensities, by = c("subjectID", "run", "volume")) %>%
  left_join(., rp, by = c("subjectID", "run", "volume")) %>%
  mutate(tile = paste0("tile_",tile)) %>%
  group_by(subjectID, run, tile) %>%
  mutate(Diff.mean = volMean - lag(volMean),
         Diff.sd = volSD - lag(volSD)) %>%
  spread(tile, freqtile_power)

load models

setwd("~/Documents/code/dsnlab/automotion-test-set")
model_log = readRDS("model_log_TDS.rds")
model_svm = readRDS("model_svm_TDS.rds")

split the data

set.seed(101) 
sample = sample.split(joined$artifact, SplitRatio = .75)
training = subset(joined, sample == TRUE)
testing = subset(joined, sample == FALSE)

machine learning

tidy data

ml = joined %>%
  group_by(subjectID, run) %>%
  mutate(Diff.mean = ifelse(is.na(Diff.mean),0,Diff.mean),
         Diff.sd = ifelse(is.na(Diff.sd),0,Diff.sd)) %>%
  gather(tile,freqtile_power, starts_with("tile")) %>%
  mutate(tile = paste0(tile,"_c")) %>%
  group_by(subjectID, run, tile) %>%
  mutate(freqtile_power_c = freqtile_power - mean(freqtile_power, na.rm=TRUE)) %>%
  ungroup() %>%
  select(-freqtile_power) %>%
  spread(tile,freqtile_power_c) %>%
  select(-trash.rp, -volMean, -volSD, -euclidian_rot, -euclidian_trans) %>%
  select(subjectID, run, volume, artifact, everything())

logistic regression

x = as.matrix(ml[,-c(1,2,3,4)])
y = as.double(as.matrix(ml[, 4]))
pred = predict(model_log, newx = x, s=model_log$lambda.1se, type="response")
pred[pred > .03] = 1
pred[pred < .03] = 0

svm

full_svm = ml  %>%
  mutate(artifact = ifelse(artifact == 1, "yes","no"),
         artifact = as.factor(artifact))

# re-run svm on full sample
full_pred = predict(model_svm, newdata = full_svm, type="prob") %>%
  select(-no)
full_pred =  as.matrix(full_pred)
full_pred[full_pred > .07] = "yes"
full_pred[full_pred < .07] = "no"

auto-motion process

man = joined %>%
  gather(tile, freqtile_power, starts_with("tile")) %>%
  filter(tile %in% c("tile_1", "tile_10")) %>%
  
  # code trash based on mean, sd, and rp
  ungroup %>%
  mutate(meanDiff.mean = mean(Diff.mean, na.rm=TRUE),
         sdDiff.mean = sd(Diff.mean, na.rm=TRUE),
         meanDiff.sd = mean(Diff.sd, na.rm=TRUE),
         sdDiff.sd = sd(Diff.sd, na.rm=TRUE),
         
         # code volumes above mean thresholds as trash
         upper.mean = meanDiff.mean + 2*sdDiff.mean,
         lower.mean = meanDiff.mean - 2*sdDiff.mean,
         trash.mean = ifelse(Diff.mean > upper.mean | Diff.mean < lower.mean, 1, 0),
         trash.mean = ifelse(is.na(Diff.mean),0,trash.mean),
         
         upper.sd = meanDiff.sd + 2*sdDiff.sd,
         lower.sd = meanDiff.sd - 2*sdDiff.sd,
         trash.sd = ifelse(Diff.sd > upper.sd | Diff.sd < lower.sd, 1, 0),
         trash.sd = ifelse(is.na(Diff.sd),0,trash.sd),
         
         # code volumes with more than +/- .25mm translation or rotation in Euclidian distance
         trash.rp.tr = ifelse(euclidian_trans_deriv > .25 | euclidian_trans_deriv < -.25, 1, 0),
         trash.rp.rot = ifelse(euclidian_rot_deriv > .25 | euclidian_rot_deriv < -.25, 1, 0)) %>%
  select(-meanDiff.mean, -meanDiff.sd, -sdDiff.mean, -sdDiff.sd) %>%
  
  # code trash based on striping
  group_by(subjectID, run, tile) %>%
  mutate(freqtile_power_c = freqtile_power - mean(freqtile_power, na.rm=TRUE)) %>%
  ungroup() %>%
  select(-freqtile_power) %>%
  spread(tile,freqtile_power_c) %>%
  mutate(trash.stripe = ifelse(tile_1 < -.035 & tile_10 > .00025, 1, 0)) %>%
  
  # combine trash
  mutate(trash.combined = ifelse(trash.stripe == 1, 1, 0),
         trash.sum = trash.rp.tr + trash.rp.rot + trash.mean + trash.sd + trash.stripe,
         trash.combined = ifelse((trash.rp.tr + trash.rp.rot + trash.mean + trash.sd) > 1, 1, trash.combined)) %>%

  # recode as trash if volume behind and in front are both marked as trash
  mutate(trash.combined = ifelse(trash.combined == 0 & lag(trash.combined) == 1 & lead(trash.combined) == 1, 1, trash.combined)) %>%
         
  # code first volume as trash if second volume is trash
  mutate(trash.combined = ifelse(volume == 1 & lead(trash.combined) == 1, 1, trash.combined)) %>%

  # code hits
  mutate(hits = ifelse(trash.combined == 1 & (artifact == 1), "hit",
                ifelse(trash.combined == 0 & (artifact == 1), "neg",
                ifelse(trash.combined == 1 & (artifact == 0), "pos", NA))),
         label = ifelse(regexpr('.*', hits), as.character(volume), ''),
         hits = as.factor(hits)) %>%
  gather(tile, freqtile_power_c, c("tile_1", "tile_10"))

confusion matrices

logistic regression

confusionMatrix(pred, y)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 21709   197
##          1   518   356
##                                              
##                Accuracy : 0.9686             
##                  95% CI : (0.9663, 0.9708)   
##     No Information Rate : 0.9757             
##     P-Value [Acc > NIR] : 1                  
##                                              
##                   Kappa : 0.4836             
##  Mcnemar's Test P-Value : <0.0000000000000002
##                                              
##             Sensitivity : 0.9767             
##             Specificity : 0.6438             
##          Pos Pred Value : 0.9910             
##          Neg Pred Value : 0.4073             
##              Prevalence : 0.9757             
##          Detection Rate : 0.9530             
##    Detection Prevalence : 0.9616             
##       Balanced Accuracy : 0.8102             
##                                              
##        'Positive' Class : 0                  
## 

svm

confusionMatrix(full_pred, full_svm$artifact)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    no   yes
##        no  21877   209
##        yes   350   344
##                                           
##                Accuracy : 0.9755          
##                  95% CI : (0.9734, 0.9774)
##     No Information Rate : 0.9757          
##     P-Value [Acc > NIR] : 0.6126          
##                                           
##                   Kappa : 0.5393          
##  Mcnemar's Test P-Value : 0.000000003193  
##                                           
##             Sensitivity : 0.9843          
##             Specificity : 0.6221          
##          Pos Pred Value : 0.9905          
##          Neg Pred Value : 0.4957          
##              Prevalence : 0.9757          
##          Detection Rate : 0.9604          
##    Detection Prevalence : 0.9695          
##       Balanced Accuracy : 0.8032          
##                                           
##        'Positive' Class : no              
## 

manual

man1 = man %>%
  filter(tile == "tile_1")
confusionMatrix(man1$trash.combined, man1$artifact)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 22149   191
##          1    78   362
##                                                
##                Accuracy : 0.9882               
##                  95% CI : (0.9867, 0.9896)     
##     No Information Rate : 0.9757               
##     P-Value [Acc > NIR] : < 0.00000000000000022
##                                                
##                   Kappa : 0.7231               
##  Mcnemar's Test P-Value : 0.000000000008565    
##                                                
##             Sensitivity : 0.9965               
##             Specificity : 0.6546               
##          Pos Pred Value : 0.9915               
##          Neg Pred Value : 0.8227               
##              Prevalence : 0.9757               
##          Detection Rate : 0.9723               
##    Detection Prevalence : 0.9807               
##       Balanced Accuracy : 0.8256               
##                                                
##        'Positive' Class : 0                    
## 

plot and compare

join and plot models

# logistic regression
data.plot.log = bind_cols(ml, as.data.frame(y), as.data.frame(pred)) %>%
  mutate(hits = ifelse(y == 1 & `1` == 1, "hit",
                ifelse(y == 0 & `1` == 1, "pos",
                ifelse(y == 1 & `1` == 0, "neg", NA))),
         label = ifelse(regexpr('.*', hits), as.character(volume), ''),
         hits = as.factor(hits))

# svm
data.plot.svm = bind_cols(full_svm, as.data.frame(full_pred)) %>%
  rename("full_pred" = yes) %>%
  mutate(hits = ifelse(artifact == "yes" & full_pred == "yes", "hit",
                ifelse(artifact == "no" & full_pred == "yes", "pos",
                ifelse(artifact == "yes" & full_pred == "no", "neg", NA))),
         label = ifelse(regexpr('.*', hits), as.character(volume), ''),
         hits = as.factor(hits))

# plot and compare models
plot.comp = man %>%
  filter(tile == "tile_1") %>%
  select(subjectID, run, volume, euclidian_trans_deriv, hits) %>%
  rename("auto" = hits) %>%
  left_join(data.plot.log, by = c("subjectID", "run", "volume", "euclidian_trans_deriv")) %>%
  select(subjectID, run, volume, euclidian_trans_deriv, auto, hits) %>%
  rename("log" = hits) %>%
  left_join(data.plot.svm, by = c("subjectID", "run", "volume", "euclidian_trans_deriv")) %>%
  select(subjectID, run, volume, euclidian_trans_deriv, auto, log, hits) %>%
  rename("svm" = hits) %>%
  gather(model, hits, c("auto", "log", "svm")) %>%
  mutate(label = ifelse(regexpr('.*', hits), as.character(volume), ''))

ggplot(plot.comp, aes(x = volume, y = euclidian_trans_deriv)) +
  geom_line(size = .25) +
  geom_point(data = subset(plot.comp, !is.na(hits)), aes(color = hits), size = 2.5) +
  geom_text(aes(label = label), size = 1.5) +
  facet_wrap(~ subjectID + run + model, ncol = 9, scales = "free") +
  scale_color_manual(values = c("#3B9AB2","#EBCC2A","#F21A00")) +
  theme(axis.text.x = element_text(size = 6))